Residency- 4 : Unsupervised Learning Assignment : project 2
#Lets add libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the vehicle dataset from the CSV file.
dfCars = pd.read_csv("vehicle-1.csv")
# First look at the data: column summary, dimensions and the first ten rows.
dfCars.info()
dfCars.shape
dfCars.head(10)
# Let's check if there are any duplicate records (prints the number of duplicated rows).
duplicate = dfCars.duplicated()
print(duplicate.sum())
# Number of missing values per column.
dfCars.isna().sum()
The output shown above suggests that there are null values in multiple features.
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so the imputation below would silently modify
# the raw data held in dfCars as well.
dfCars1 = dfCars.copy()
# Rows without any missing value; used as the reference for the class-wise means.
df = dfCars1.dropna()
sns.pairplot(df[['circularity', 'class']], hue='class', diag_kind='kde')
# 'class' holds string labels, so the statistics are restricted to numeric
# columns (recent pandas versions raise on non-numeric columns otherwise).
df.skew(numeric_only=True)
df.kurtosis(numeric_only=True)
# Replace each missing 'circularity' value with the mean 'circularity' of the
# complete rows (df) that belong to the same vehicle class.
for vehicle_class in ('car', 'bus', 'van'):
    class_mean = df.loc[df['class'] == vehicle_class, 'circularity'].mean()
    dfCars1['circularity'] = np.where(
        dfCars1['circularity'].isnull() & (dfCars1['class'] == vehicle_class),
        class_mean,
        dfCars1['circularity'],
    )
# Sanity checks: the reference mean for 'van', the first imputed values and
# any rows whose 'circularity' is still missing.
df[df['class'] == 'van']['circularity'].mean()
pd.DataFrame(dfCars1['circularity']).head(6)
dfCars1[dfCars1['circularity'].isnull()]
# Same plot and statistics as above, now on the imputed data, for comparison.
sns.pairplot(dfCars1[['circularity', 'class']], hue='class', diag_kind='kde')
dfCars1.skew(numeric_only=True)
dfCars1.kurtosis(numeric_only=True)
The steps above verify that the data is imputed correctly, as expected.
# Let's do the same for the other features: every missing value is replaced
# with the mean of that feature over the complete rows (df) of the same class.
# One loop replaces the per-feature / per-class copies of the same statement,
# so a feature can be added or removed by editing this list only.
features_to_impute = [
    'distance_circularity',
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'scatter_ratio',
    'elongatedness',
    'pr.axis_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
    'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1',
    'skewness_about',
    'skewness_about.1',
    'skewness_about.2',
]
for feature in features_to_impute:
    for vehicle_class in ('car', 'bus', 'van'):
        # Mean computed on the dataframe having no null values.
        class_mean = df.loc[df['class'] == vehicle_class, feature].mean()
        dfCars1[feature] = np.where(
            dfCars1[feature].isnull() & (dfCars1['class'] == vehicle_class),
            class_mean,
            dfCars1[feature],
        )
# Sanity checks: rows still missing a value in the first two imputed features,
# followed by the remaining number of missing values per column.
dfCars1[dfCars1['distance_circularity'].isnull()]
dfCars1[dfCars1['radius_ratio'].isnull()]
dfCars1.isna().sum()
# Let's check for outliers using a box plot of dfCars1 (wide figure so the columns fit side by side).
dfCars1.boxplot(figsize=(20,3))
The box plot above shows outliers in 8 features - radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance, scaled_variance.1, scaled_radius_of_gyration.1, skewness_about, skewness_about.1, skewness_about.2
# Closer look at the features flagged in the overall box plot.
dfCars1[['radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scaled_variance', 'scaled_variance.1',
         'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1']].boxplot(figsize=(20,3))
# Let's impute values for outliers
dfCars1.describe()
# Take a real snapshot before the values are overwritten: a plain assignment
# would only alias dfCars1, so the "backup" would be changed together with it.
dfCars1_backup = dfCars1.copy()
# Upper cut-off per feature; values above it are treated as outliers.
outlier_cutoffs = {
    'radius_ratio': 250,
    'pr.axis_aspect_ratio': 80,
    'max.length_aspect_ratio': 18,
    'scaled_variance': 300,
    'scaled_variance.1': 990,
    'scaled_radius_of_gyration.1': 87,
    'skewness_about': 19,
    'skewness_about.1': 38,
}
for feature, cutoff in outlier_cutoffs.items():
    # Start a new figure so each box plot is drawn on its own axes instead of
    # being layered on top of the previous one.
    plt.figure()
    dfCars1[[feature]].boxplot()
    # Replace every value above the cut-off with the mean of that feature over
    # the complete rows (df) of the same vehicle class.
    for vehicle_class in ('van', 'bus', 'car'):
        class_mean = df.loc[df['class'] == vehicle_class, feature].mean()
        dfCars1[feature] = np.where(
            (dfCars1[feature] > cutoff) & (dfCars1['class'] == vehicle_class),
            class_mean,
            dfCars1[feature],
        )
# Overall box plot again, after the replacement.
plt.figure()
dfCars1.boxplot()
Outlier treatment is done.
fig, ax = plt.subplots(figsize=(10, 10))  # figure size in inches
# 'class' holds string labels, so only the numeric columns are correlated;
# recent pandas versions raise when corr() is given a non-numeric column.
sns.heatmap(dfCars1.select_dtypes(include='number').corr(), annot=True, linewidths=.5, ax=ax)
# Pairwise scatter plots of all features, coloured by vehicle class.
sns.pairplot(dfCars1, hue='class', diag_kind='kde')
Understanding Attributes - Based on the heat map and pair plot shown above.
1) Looking along the diagonal of the pair plot, most of the features do not give a clear distinction between the 3
categories (van, bus, car) of distribution. A few features like scaled_variance, scaled_variance.1,
pr.axis_rectangularity, elongatedness, scatter_ratio, max.length_aspect_ratio are able, to some extent,
to distinguish between 2 classes.
2) The heat map and pair plot show
a) There is a strong correlation between compactness, circularity, distance_circularity, radius_ratio,
max.length_aspect_ratio, scatter_ratio, pr.axis_rectangularity, max.length_rectangularity,
scaled_variance, scaled_variance.1, scaled_radius_of_gyration
b) Elongatedness - has a strong negative correlation with compactness, circularity, distance_circularity, radius_ratio,
max.length_aspect_ratio, scatter_ratio, max.length_rectangularity, scaled_variance,
scaled_variance.1, scaled_radius_of_gyration
# Keep an independent snapshot of the cleaned data; a plain assignment would
# only create another name for the same DataFrame.
dfCars1_bck = dfCars1.copy()
from sklearn.model_selection import train_test_split
from sklearn import metrics
# 'class' is the label to predict; every other column is used as a feature.
target = dfCars1['class']
features = dfCars1.drop(columns=['class'])
# 70/30 split, stratified on the label, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, stratify=target, test_size=0.3, random_state=10
)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Result lines keyed by a running counter; they are printed together at the end.
ScoreDetails = {}
Count = 1

# Let's see what a linear SVM and the non-linear kernels give on the raw
# (unscaled) features. gamma is not passed to the linear model because SVC
# only uses it for the 'rbf', 'poly' and 'sigmoid' kernels.
svm_model = SVC(C=0.1, kernel='linear')
svm_model1 = SVC(kernel='rbf')
svm_model2 = SVC(kernel='poly')
svm_model3 = SVC(kernel='sigmoid')
unscaled_runs = [
    (svm_model, "Accuracy on test data - Linear SVM model - Wihtout Scaling -> "),
    (svm_model1, "Accuracy on test data - Kernel(rbf) based- Without scaling ->"),
    (svm_model2, "Accuracy on test data- Kernel(poly) based SVM model-Without scaling -> "),
    (svm_model3, "Accuracy on test data - Kernal(sigmod) based SVM model - Without Scaling ->"),
]
for model, var1 in unscaled_runs:
    model.fit(X_train, y_train)
    # Predict once and reuse the result for both the accuracy and the confusion matrix.
    y_pred = model.predict(X_test)
    var2 = str(accuracy_score(y_test, y_pred))
    print(var1 + var2)
    ScoreDetails[Count] = var1 + var2
    Count = Count + 1
    print(confusion_matrix(y_test, y_pred))
# Let's apply scaling to the training and test data and check the results.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# The scaler is fitted on the training data only and then applied to the test
# data, so no information from the test set enters the scaling.
X_train_normal = sc.fit_transform(X_train)
X_test_normal = sc.transform(X_test)

# Linear SVM on the scaled features. gamma is not passed because SVC only
# uses it for the 'rbf', 'poly' and 'sigmoid' kernels.
normal_svm_model1 = SVC(C=2, kernel='linear')
normal_svm_model1.fit(X_train_normal, y_train)
# Predict once and reuse the result for both the accuracy and the confusion matrix.
y_pred = normal_svm_model1.predict(X_test_normal)
var1 = "Accuracy on test data - Linear SVM model-Scaled data -> "
var2 = str(accuracy_score(y_test, y_pred))
print(var1 + var2)
ScoreDetails[Count] = var1 + var2
Count = Count + 1
print(confusion_matrix(y_test, y_pred))

# Let's use a non-linear model (rbf kernel) on the scaled features.
normal_svm_model2 = SVC(kernel='rbf')
normal_svm_model2.fit(X_train_normal, y_train)
y_pred = normal_svm_model2.predict(X_test_normal)
var1 = "Accuracy on test data - Kernel(rbf) based SVM model-Scaled data -> "
var2 = str(accuracy_score(y_test, y_pred))
print(var1 + var2)
ScoreDetails[Count] = var1 + var2
Count = Count + 1
print(confusion_matrix(y_test, y_pred))
It is observed that, after scaling the data, the SVC model (kernel = rbf) gives a huge performance gain (from 50 % to 95 %).
from sklearn.decomposition import PCA
# PCA keeping all 18 components of the scaled training data.
pca_model = PCA(n_components=18)
# fit_transform fits the model and projects the training data in one step, so
# a separate fit() call beforehand is not needed. The test data is projected
# with the components learned from the training data.
pcascaledXtrain_allcomp = pca_model.fit_transform(X_train_normal)
pcascaledXtest_allcomp = pca_model.transform(X_test_normal)
# Shape of the projected training data (the original referred to an undefined name here).
pcascaledXtrain_allcomp.shape
pca_model.components_
pca_model.explained_variance_
pca_model.explained_variance_ratio_
print("Total variance - " + str((pca_model.explained_variance_).sum()))
print("95 % of Total variance - " + str((pca_model.explained_variance_).sum()*0.95))
# Variance captured by the first 7 components, in total and per component,
# for comparison with the 95 % figure printed above.
((pca_model.explained_variance_)[:7]).sum()
((pca_model.explained_variance_)[:7])
First 7 components capture 95 % of the variance
# Let's create a PCA model considering only 7 components.
pca_model_new = PCA(n_components=7)
# fit_transform already fits the model, so the model is fitted only once here.
# The test data is projected with the components learned from the training data.
pcascaledXtrain = pca_model_new.fit_transform(X_train_normal)
pcascaledXtest = pca_model_new.transform(X_test_normal)
pca_model_new.components_
pca_model_new.explained_variance_
pca_model_new.explained_variance_ratio_
# SVM models trained on the PCA projections: a linear and an rbf model on the
# 7-component projection, then a linear and an rbf model on the projection
# that keeps all 18 components.
pca_svm_model = SVC(C=1, kernel='linear')
pca_svm_model1 = SVC(kernel='rbf')
pca_svm_model11 = SVC(C=1, kernel='linear')
pca_svm_model12 = SVC(kernel='rbf')
# Each entry: (model, training matrix, test matrix, label of the result line).
pca_runs = [
    (pca_svm_model, pcascaledXtrain, pcascaledXtest,
     "Accuracy on test data - Linear SVM model-using 7 PCA Components -> "),
    (pca_svm_model1, pcascaledXtrain, pcascaledXtest,
     "Accuracy on test data - Kernel(rbf) based SVM model-using 7 PCA Components-> "),
    (pca_svm_model11, pcascaledXtrain_allcomp, pcascaledXtest_allcomp,
     "Accuracy on test data - Linear SVM model-using all PCA Components -> "),
    (pca_svm_model12, pcascaledXtrain_allcomp, pcascaledXtest_allcomp,
     "Accuracy on test data - Kernel(rbf) based SVM model-using all PCA Components -> "),
]
for estimator, train_matrix, test_matrix, var1 in pca_runs:
    estimator.fit(train_matrix, y_train)
    test_accuracy = accuracy_score(y_test, estimator.predict(test_matrix))
    var2 = str(test_accuracy)
    print(var1 + var2)
    # Record the line for the final summary and advance the running counter.
    ScoreDetails[Count] = var1 + var2
    Count += 1
# Print every recorded accuracy line, in the order in which it was recorded.
for score_line in ScoreDetails.values():
    print(score_line)
The details above show the accuracy of the SVM models using 1) raw data (without scaling) 2) scaled data 3) principal components